Preprocessing

Data is extracted and cleaned using Python in simulation.ipynb. The Python notebook is also used to generate a rudimentary config file, but some things (network connectivity) are specified manually.

R reads the cleaned data from the spreadsheet and uses this to: * Create a graph structure * Make the data into a JAGS-readable format with some post-pre-processing

# read in config data
configsheets = excel_sheets(configpath)
for (sheet in configsheets) {
  assign(sheet, read_excel(configpath, sheet))
}
stopifnot(!anyDuplicated(well_fp_map$well)) # each well cannot map to multiple flash plants

# read in regression data (plus extra)
regression_df = read_excel(regdatapath)
# dry_df = read_csv(extradatapath) %>% rename(mf = ip_sf)
dry_df = read_csv(extradatapath) %>% rename(well=facility)
## Parsed with column specification:
## cols(
##   facility = col_character(),
##   date = col_date(format = ""),
##   key = col_character(),
##   mf = col_double()
## )
regression_df = regression_df %>%
  mutate(date_numeric = as.numeric(date - base_datetime)) %>%
  mutate(date_numeric=ifelse(date_numeric>0, date_numeric, NA))  # remove dates before baseline
## Warning: package 'bindrcpp' was built under R version 3.4.4
dry_df = dry_df %>%
  filter(well %ni% unique(regression_df$well)) %>%
  mutate(date_numeric = as.numeric(as.POSIXct(date) - base_datetime)) %>%
  mutate(date_numeric=ifelse(date_numeric>0, date_numeric, NA))  # remove dates before baseline
well_fp_map = well_fp_map %>% select(well, fp) %>%drop_na()

# today_numeric = (Sys.time() - base_datetime) %>% as.numeric()
today_numeric = (today_datetime - base_datetime) %>% as.numeric()

# assign unique facility IDs
liq_wells = unique(regression_df$well) # aka production curve wells
dry_wells = unique(dry_df$well) # aka time series wells
map_wells = unique(well_fp_map$well)

no_data_wells = map_wells[!map_wells %in% c(liq_wells, dry_wells)]  # see which ones we're completely guessing for
no_map_wells = c(liq_wells, dry_wells)[!c(liq_wells, dry_wells) %in% map_wells]

well_names = c(liq_wells, dry_wells)
fp_names = c(well_fp_map$fp, fp_gen_map$fp, fp_constants$fp) %>% unique()
fluid_types = c('ip', 'lp', 'w')
gen_names = gen_constants$gen %>% unique() %>% sort()
ip_gen_names = paste(gen_names, 'ip', sep='_')
lp_gen_names = paste(gen_names, 'lp', sep='_')
w_gen_names = paste(gen_names, 'w', sep='_')
dummy_gen_names = c(ip_gen_names, lp_gen_names, w_gen_names) %>% sort()
all_names = c('DUMMY', well_names, fp_names, dummy_gen_names, gen_names)
ids = 1:length(all_names)
names(ids) = all_names

# add names in data with IDs
regression_df = regression_df %>% mutate(well_id=ids[well])
dry_df = dry_df %>% mutate(well_id=ids[well])
operating_conditions = operating_conditions %>% mutate(well_id=ids[well]) %>% rename(whp_pred=whp)
fp_constants = fp_constants %>% mutate(fp_id=ids[fp])
gen_constants = gen_constants %>% mutate(gen_id=ids[gen]) %>% select(-gen)
well_fp_map = well_fp_map %>% mutate(well_id=ids[well], fp_id=ids[fp]) %>% select(-c(well, fp))
fp_gen_map = fp_gen_map %>% mutate(fp_id=ids[fp], gen_ip_id=ids[gen_ip], gen_lp_id=ids[gen_lp], gen_w_id=ids[gen_w]) %>% select(-c(fp, gen_ip, gen_lp, gen_w))

Graph

# create connectivity matrix. i flows to j
# wells to FPs
v = matrix(0, nrow=length(ids), ncol=length(ids))
v[1,-1] = 1
for (i in 1:nrow(well_fp_map)) {
  id_i = well_fp_map[[i, 1]]
  id_j = well_fp_map[[i, 2]]
  v[id_i, id_j] = 1
}
# send ip/lp/w flows to dummy gens
for (i in 1:nrow(fp_gen_map)) {
  id_i = fp_gen_map[[i, 1]]
  for (j in 2:ncol(fp_gen_map)) {
    facility_j = names(ids)[fp_gen_map[[i, j]]]
    facility_dummy_j = paste(facility_j, fluid_types[j-1], sep='_')
    id_j = ids[facility_dummy_j]
    if (!is.na(id_j)) {
      v[id_i, id_j] = 1
    }
  }
}
# dummy gens to gens
for (i in 1:nrow(gen_constants)) {
  id_j = gen_constants$gen_id[i]
  facility_j = names(ids)[id_j]
  for (fluid in fluid_types) {
    facility_dummy_i = paste(facility_j, fluid, sep='_')
    id_i = ids[facility_dummy_i]
    v[id_i, id_j] = 1
  }
}

# convert form
m = matrix(0, nrow=nrow(v), ncol=max(colSums(v)))
rownames(m) = all_names
for (i in 1:nrow(v)) {
  for (j in 1:ncol(v)) {
    if (v[[i, j]]==1) {
      m[j, sum(m[j,]>0)+1] = i
    }
  }
}

# generate coordinates
dummy_locs = data.frame(name='DUMMY', x=-0.1, y=0)
well_locs = data.frame(name=well_names, x=0, y=seq(1, 1/(length(well_names)-1), length.out=length(well_names)))
fp_locs = data.frame(name=fp_names, x=1, y=seq(0, 1, length.out=length(fp_names)))
gen_dummy_locs = data.frame(name=dummy_gen_names, x=2, y=seq(0, 1, length.out=length(dummy_gen_names)))
gen_locs = data.frame(name=gen_names, x=2.5, y=seq(1/11, 10/11, length.out=length(gen_names)))
locs = rbind(dummy_locs, well_locs, fp_locs, gen_dummy_locs, gen_locs)
locs$id = ids[locs$name]
locs = locs %>% arrange(id)

g = graph_from_adjacency_matrix(v) %>%
  set_vertex_attr('label', value=all_names) %>%
  set_vertex_attr('x', value=as.vector(locs$x)) %>%
  set_vertex_attr('y', value=as.vector(locs$y)) %>%
  set_vertex_attr('label.degree', value=pi) %>%
  as.undirected()
V(g)$size = ifelse(V(g)$label %in% well_names, 5, 8)
V(g)$color = ifelse(V(g)$label %in% dry_wells, "red", "orange")
E(g)$color = "black"
E(g)[which(tail_of(g, E(g))$label=="DUMMY")]$color = "grey"

# png("../media/full_network.png")
par(mar=c(0,3,0,0), family="Times")
plot(g, vertex.label.dist=3,
     mark.groups = list(wells=ids[well_names], fps=ids[fp_names], gens=ids[gen_names]),
     mark.col = "#EEEEEE",
     mark.border = NA)
text(c(-1, -0.3, 0.4, 0.9), 1.15, c("Wells", "Flash plants", "Dummy gens", "Generators"), cex=1.25)

The dummy node is necessary because when indexing a subset of flows that go into a node, this subset cannot be empty. The dummy node has zero mass flowing out of it.

Data

regression_list = regression_df %>% select(well_id, whp, mf, date_numeric) %>% as.list()
dry_list = dry_df %>%
  rename(well_id_dry=well_id, mf_dry=mf, date_numeric_dry=date_numeric) %>% # use these in a different regression
  select(well_id_dry, mf_dry, date_numeric_dry) %>% as.list()
operating_conditions_list = operating_conditions %>% arrange(well_id) %>% select(whp_pred) %>% as.list()
fp_constants_list = as.list(fp_constants)
gen_constants_list = as.list(gen_constants %>% select(gen_id, factor))
facilities = data.frame(id=1:max(ids)) %>%
  full_join(operating_conditions %>% rename(id=well_id) %>% select(-well), by='id') %>%
  full_join(gen_constants %>% select(factor, id=gen_id), by='id') %>%
  full_join(fp_constants %>% rename(id=fp_id), by='id') %>%
  mutate(mf_pred=NA) %>%
  mutate(n_inflows=colSums(v))
## Warning: Column `id` has different attributes on LHS and RHS of join

## Warning: Column `id` has different attributes on LHS and RHS of join

## Warning: Column `id` has different attributes on LHS and RHS of join
well_ids = ids[well_names]
liq_well_ids = ids[liq_wells]
dry_well_ids = ids[dry_wells]
fp_ids = ids[fp_names]
ip_gen_ids = ids[ip_gen_names]
lp_gen_ids = ids[lp_gen_names]
w_gen_ids = ids[w_gen_names]
gen_ids = ids[gen_names]

# force all mass to IP steam
dry_fps = c("poi dry", "direct ip")
dry_fp_ids = ids[dry_fps]
facilities$hf_ip[facilities$id %in% dry_fp_ids] = 10
facilities$hfg_ip[facilities$id %in% dry_fp_ids] = 10
facilities_list = facilities %>% select(-id) %>% as.list()

# insert production curve predictions
prod = data.frame(whp_prod=seq(6, 16, length.out=10),
                  well_id_prod=ids[production_curve_well])
## Warning in data.frame(whp_prod = seq(6, 16, length.out = 10), well_id_prod
## = ids[production_curve_well]): row names were found from a short variable
## and have been discarded
ts = data.frame(date_numeric_ts=seq(max(dry_df$date_numeric)-30, max(dry_df$date_numeric)+30, length.out=10),
                well_id_ts=ids[ts_well])
## Warning in data.frame(date_numeric_ts = seq(max(dry_df$date_numeric) -
## 30, : row names were found from a short variable and have been discarded
prod_list = prod %>% as.list
ts_list = ts %>% as.list

# experimental TS data matrix for dry wells
ar_order = 1
empty = setNames(data.frame(matrix(ncol = length(all_names), nrow = 0)), all_names)
drymatrix = dry_df %>% 
  select(well, date_numeric, mf) %>% 
  spread(well, mf) %>% 
  select(-date_numeric)
drymatrix = empty %>%
  full_join(drymatrix) %>%
  as.matrix()
## Joining, by = c("wk116", "wk123", "wk229", "wk26a", "wk26b", "wk27", "wk28", "wk46", "wk55", "wk67", "wk70", "wk71", "wk72", "wk74", "wk76", "wk81", "wk83", "wk118", "wk216", "wk228", "wk233", "wk234", "wk236", "wk237", "wk238", "wk240", "wk241", "wk249", "wk25", "wk250", "wk251", "wk252", "wk604", "wk605", "wk606", "wk607", "wk610", "wk65")
ar_well_ids = which(complete.cases(t(drymatrix[1:(ar_order+1),])))
# which wells can we not use AR for
dry_no_ar_wells = dry_wells[!dry_well_ids %in% ar_well_ids]
dry_no_ar_well_ids = ids[dry_no_ar_wells]

# extend matrix for prediction
days_since_first = as.integer(today_datetime - as.POSIXct(min(dry_df$date)))
drymatrix = rbind(drymatrix, matrix(NA, nrow=days_since_first, ncol=ncol(drymatrix)))

# combine into one list
data = c(regression_list, dry_list, facilities_list, prod_list, ts_list,
         list(well_ids=well_ids, liq_well_ids=liq_well_ids, 
              dry_well_ids=dry_well_ids, dry_no_ar_well_ids=dry_no_ar_well_ids,
              fp_ids=fp_ids,
              gen_ids=gen_ids, ip_gen_ids=ip_gen_ids, lp_gen_ids=lp_gen_ids, w_gen_ids=w_gen_ids,
              today_numeric=today_numeric, m=m, dummy=1,
              ts=drymatrix, ts_ar=drymatrix, ts_ema=drymatrix, ar_well_ids=ar_well_ids))
data$whp_pred[is.na(data$whp_pred)] <- mean(data$whp_pred, na.rm=T)

# center covariates
mean_whp <- mean(data$whp, na.rm=T)
mean_date_numeric <- mean(data$date_numeric, na.rm=T)

data$whp_c <- data$whp - mean_whp
data$whp_pred_c <- data$whp_pred - mean_whp
data$whp_prod_c <- data$whp_prod - mean_whp
data$date_numeric_c <- data$date_numeric - mean_date_numeric
data$today_numeric_c <- data$today_numeric - mean_date_numeric
data$date_numeric_dry_c <- data$date_numeric_dry - mean_date_numeric
data$date_numeric_ts <- data$date_numeric_ts - mean_date_numeric

Model

code = "
data {
  D <- dim(ts)
}
model {
  ##############################################
  # fit individual regressions to liquid wells #
  ##############################################
  for (i in 1:length(mf)) {
    mu[i] <- Intercept[well_id[i]] + beta_whp[well_id[i]] * whp_c[i] + beta_date[well_id[i]] * date_numeric_c[i]
    mf[i] ~ dnorm(mu[i], tau[well_id[i]])
    mf_fit[i] ~ dnorm(mu[i], tau[well_id[i]])
    # mf_fit[i] ~ dnorm(mu[i]*measurement_error_factor[i], tau[well_id[i]])
    # measurement_error_factor[i] ~ dunif(0.9, 1.1)
  }
  # fit regression to dry wells
  for (i in 1:length(mf_dry)) {
    mu_dry[i] <- Intercept[well_id_dry[i]] + beta_date[well_id_dry[i]] * date_numeric_dry_c[i]
    mf_dry[i] ~ dnorm(mu_dry[i], tau[well_id_dry[i]])
    mf_dry_fit[i] ~ dnorm(mu_dry[i], tau[well_id_dry[i]])
    # measurement_error_factor_dry[i] ~ dunif(0.9, 1.1)
  }
  for (j in dry_well_ids) {
    Intercept[j] ~ dnorm(0, 1e-12)
    beta_date[j] ~ dnorm(0, 1e-12)
    tau[j] ~ dgamma(1e-12, 1e-12)
  }
  # experimental AR1 model for dry wells
  for (j in ar_well_ids) {
    for (t in 2:D[1]) {
      mu_ar[t,j] <- c_ar[j] + theta_ar[j]*ts_ar[t-1,j]
      ts_ar[t,j] ~ dnorm(mu_ar[t,j], tau_ar[j]) T(0,)
    }
    theta_ar[j] ~ dnorm(0, 1e-12)
    c_ar[j] ~ dnorm(0, 1e-12)
    tau_ar[j] ~ dgamma(1e-12, 1e-12)
  }
  # experimental EMA model (don't use)
  for (j in ar_well_ids) {
    for (t in 2:D[1]) {
      mu_ema[t,j] <- alpha*mu_ema[t-1,j] + (1-alpha)*ts_ema[t-1,j]
      ts_ema[t,j] ~ dnorm(mu_ema[t,j], tau_ema[j]) T(0,)
    }
    mu_ema[1,j] <- ts_ema[1,j]
    theta_ema[j] ~ dnorm(0, 1e-12)
    c_ema[j] ~ dnorm(0, 1e-12)
    tau_ema[j] ~ dgamma(1e-12, 1e-12)
  }
  alpha ~ dbeta(0.5, 0.5)

  # HIERARCHICAL
  # fills in for any missing wells
  for (j in liq_well_ids) {
    Intercept[j] ~ dnorm(mu_Intercept, tau_Intercept)
    beta_whp[j] ~ dnorm(mu_beta_whp, tau_beta_whp)
    beta_date[j] ~ dnorm(mu_beta_date, tau_beta_date)
    tau[j] ~ dgamma(1e-12, 1e-12)
    sd[j] <- 1/max(sqrt(tau[j]), 1e-12)
  }

  # fill in any missing data
  for (i in 1:length(mf)) {
    date_numeric_c[i] ~ dnorm(mu_date_numeric, tau_date_numeric)
  }
  mu_date_numeric ~ dnorm(0, 1e-12)
  tau_date_numeric ~ dnorm(1e-12, 1e-12)
  
  # set hyperparameters
  mu_Intercept ~ dnorm(0, 1e-12)
  mu_beta_whp ~ dnorm(0, 1e-12)
  mu_beta_date ~ dnorm(0, 1e-12)
  tau_Intercept ~ dgamma(1e-12, 1e-12)
  tau_beta_whp ~ dgamma(1e-12, 1e-12)
  tau_beta_date ~ dgamma(1e-12, 1e-12)

  #####################################
  # production curve for verification #
  #####################################
  for (i in 1:length(whp_prod)) {
    mu_prod[i] <- Intercept[well_id_prod[i]] + beta_whp[well_id_prod[i]] * whp_prod_c[i] + beta_date[well_id_prod[i]] * today_numeric_c
    mf_prod[i] ~ dnorm(mu_prod[i], tau[well_id_prod[i]])
  }
  for (i in 1:length(date_numeric_ts)) {
    mu_ts[i] <- Intercept[well_id_ts[i]] + beta_date[well_id_ts[i]] * date_numeric_ts[i]
    mf_ts[i] ~ dnorm(mu_ts[i], tau[well_id_ts[i]])
  }

  #########################################################
  # simple model to fill in missing FP enthalpy constants #
  #########################################################
  for (i in fp_ids) {
    # missing fp constants
    hf_ip[i] ~ dgamma(param[1], param[7])
    hg_ip[i] ~ dgamma(param[2], param[8])
    hfg_ip[i] ~ dgamma(param[3], param[9])
    hf_lp[i] ~ dgamma(param[4], param[10])
    hg_lp[i] ~ dgamma(param[5], param[11])
    hfg_lp[i] ~ dgamma(param[6], param[12])
  }
  for (i in c(1, well_ids)) { h[i] ~ dgamma(param[13], param[14]) } # missing well constants
  for (i in 1:14) { param[i] ~ dgamma(1e-12, 1e-12) }               # uniform priors

  ########################################
  # make predictions (the stuff we want) #
  ########################################
  mf_pred[dummy] <- 0  # dummy well
  ip_sf[dummy] <- 0
  lp_sf[dummy] <- 0
  wf[dummy] <- 0
  
  # use production curve
  for (j in liq_well_ids) {
    mf_pred[j] <- Intercept[j] + beta_whp[j] * whp_pred_c[j] + beta_date[j] * today_numeric_c
  }
  # use naive TS reg
  for (j in dry_well_ids) { # dry_no_ar_well_ids) { 
    mf_pred[j] <- Intercept[j] + beta_date[j] * today_numeric_c
  }
  # use AR(1)
  # for (j in ar_well_ids) {
  #   mf_pred[j] <- mu_ar[D[1], j]
  # }

  for (i in fp_ids) {
    mf_pred[i] <- sum(mf_pred[m[i,1:n_inflows[i]]])
    h[i] <- sum(mf_pred[m[i, 1:n_inflows[i]]] * h[m[i, 1:n_inflows[i]]]) / ifelse(mf_pred[i]!=0, mf_pred[i], 1)

    ip_sf[i] <- min(max((h[i] - hf_ip[i]), 0) / hfg_ip[i], 1) * mf_pred[i]
    lp_sf[i] <- min(max((min(hf_ip[i], h[i]) - hf_lp[i]), 0) / hfg_lp[i], 1) * (mf_pred[i] - ip_sf[i])

    total_sf[i] <- ip_sf[i] + lp_sf[i]
    wf[i] <- mf_pred[i] - total_sf[i]
  }
  # dummy gens and actual gens
  for (i in ip_gen_ids) { mf_pred[i] <- sum(ip_sf[m[i, 1:n_inflows[i]]]) }
  for (i in lp_gen_ids) { mf_pred[i] <- sum(lp_sf[m[i, 1:n_inflows[i]]]) }
  for (i in w_gen_ids) { mf_pred[i] <- sum(wf[m[i, 1:n_inflows[i]]]) }
  for (i in gen_ids) {
    mf_pred[i] <- sum(mf_pred[m[i,1:n_inflows[i]]])
    power[i] <- mf_pred[i] / mu_factor[i]
    mu_factor[i] ~ dunif(0.95*factor[i], 1.05*factor[i])  # uncertainty from email
  }
  total_power <- sum(power[gen_ids])
}
"

vars =  c('mf_fit',
          'mf_dry_fit',
          'mf_ts',
          'mf_prod',
          'mf_pred',
          'beta_date',
          'sd',
          'power',
          'total_sf',
          'mu_ar',
          'mu_ema',
          'alpha',
          # paste0('m_ts[', ar_well_ids, ']'),
          paste0('h[', fp_ids, ']'),
          paste0('mu_', c('Intercept', 'beta_whp', 'beta_date')),
          'total_power')
n_chains = 2
burn_in = 100
n_steps = 1000

model = jags.model(textConnection(code), data, n.chains=n_chains)
## Warning in jags.model(textConnection(code), data, n.chains = n_chains):
## Unused variable "whp" in data
## Warning in jags.model(textConnection(code), data, n.chains = n_chains):
## Unused variable "date_numeric" in data
## Warning in jags.model(textConnection(code), data, n.chains = n_chains):
## Unused variable "date_numeric_dry" in data
## Warning in jags.model(textConnection(code), data, n.chains = n_chains):
## Unused variable "whp_pred" in data
## Warning in jags.model(textConnection(code), data, n.chains = n_chains):
## Unused variable "fp" in data
## Warning in jags.model(textConnection(code), data, n.chains = n_chains):
## Unused variable "limit" in data
## Warning in jags.model(textConnection(code), data, n.chains = n_chains):
## Unused variable "dry_no_ar_well_ids" in data
## Warning in jags.model(textConnection(code), data, n.chains = n_chains):
## Unused variable "today_numeric" in data
## Compiling data graph
##    Resolving undeclared variables
##    Allocating nodes
##    Initializing
##    Reading data back into data table
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 4800
##    Unobserved stochastic nodes: 4623
##    Total graph size: 31557
## 
## Initializing model
update(model, burn_in)
out = coda.samples(model, n.iter=round(n_steps/n_chains), variable.names=vars)
outmatrix = as.matrix(out)
outframe = as.data.frame(outmatrix) %>%
  gather(key=facility, value=value) %>%
  mutate(variable=gsub("\\[.*$", "", facility), facility=parse_number(facility, na=c("NA")))
## Warning: 5000 parsing failures.
## row # A tibble: 5 x 4 col     row   col expected actual expected   <int> <int> <chr>    <chr>  actual 1     1    NA a number alpha  row 2     2    NA a number alpha  col 3     3    NA a number alpha  expected 4     4    NA a number alpha  actual 5     5    NA a number alpha 
## ... ................. ... ............................. ........ ............................. ...... ............................. ... ............................. ... ............................. ........ ............................. ...... .............................
## See problems(...) for more details.
outframe$facility = factor(names(ids)[outframe$facility])

Posteriors

# for over-plotting
special_wells = c('wk124', 'wk242', 'wk263', 'wk270', 'wk271')

g1 = ggplot(outframe %>% 
              filter(facility %in% well_names, variable=="mf_pred", value>0) %>%
              mutate(source = ifelse(facility %in% dry_wells, "PI time series", "Production curve")), 
            aes(x=value, fill=facility)) +
  geom_density(aes(y=..scaled..), alpha=0.5, color=NA) + xlim(0, NA) +
  facet_grid(source~.) +
  labs(title="Posterior Well Mass Flows for 2018", x="Mass flow (T/h)", y="Density", fill="Facility") +
  ggsave('../media/mf_wells.png', width=6, height=4, units='in')
  
g2 = ggplot(outframe %>% filter(variable=="beta_date") %>% filter(facility %in% special_wells), aes(x=value, fill=facility)) +
  geom_density(alpha=0.5, color=NA) +labs(title="Posterior Decline Rate of Test Data", x="beta_date (T/h/Bar)", y="Density", fill="Facility") +
  ggsave('../media/beta_date.png', width=6, height=4, units='in')

g3 = ggplot(outframe %>% filter(facility %in% fp_names, variable=="mf_pred", value>0), aes(x=value, fill=facility)) +
  geom_density(aes(y=..scaled..), alpha=0.5, color=NA) + xlim(0, NA) + 
  labs(title="Posterior Flash Plant Mass Flows for 2018", x="Mass flow (T/h)") +
  ggsave('../media/mf_fps.png', width=6, height=4, units='in')

g4 = ggplot(outframe %>% filter(facility %in% gen_names, variable=="mf_pred", value>0), aes(x=value, fill=facility)) +
  geom_density(aes(y=..scaled..), alpha=0.5, color=NA) + xlim(0, NA) + 
  labs(title="Posterior Generator Mass Flows for 2018", x="Mass flow (T/h)", y="Scaled density", fill="Facility") +
  ggsave('../media/mf_gens.png', width=6, height=4, units='in')

g5 = ggplot(outframe %>% filter(facility %in% gen_names, variable=="power", value>0), aes(x=value, fill=facility)) +
  geom_density(aes(y=..scaled..), alpha=0.5, color=NA) + xlim(0, NA) + 
  labs(title="Posterior Generator Power Output for 2018", x="Power (MW)", y="Scaled density", fill="Facility") +
  ggsave('../media/power_gens.png', width=6, height=4, units='in')

tb6 <- outframe %>% filter(variable=="sd") %>% select(facility, value) %>%
  mutate(well=factor(facility)) %>%
  group_by(well) %>%
  summarise(Mean = mean(value), 
            `Lower 2.5%` = quantile(value, 0.025), 
            `Upper 97.5%` = quantile(value, 0.975)) %>%
  mutate_if(is.numeric, round, 3) %>%
  inner_join(regression_df %>% mutate(well=factor(names(ids)[well_id])) %>% group_by(well) %>% summarise(n=n()), by="well")
g6 = ggplot(outframe %>% filter(variable=="sd") %>% filter(facility %in% special_wells), aes(x=value, fill=facility)) +
  geom_density(alpha=0.5, color=NA) + coord_cartesian(xlim=c(0, max(tb6$`Upper 97.5%`))) +
  labs(title="Posterior Flow Deviation Estimates", x="Standard deviation", y="Density", fill="Facility") +
  ggsave('../media/standard_deviation.png', width=6, height=4, units='in')

ggplotly(g1, tooltip=c('facility', 'value'))
ggplotly(g2, tooltip=c('facility', 'value'))
ggplotly(g3, tooltip=c('facility', 'value'))
ggplotly(g4, tooltip=c('facility', 'value'))
ggplotly(g5, tooltip=c('facility', 'value'))
ggplotly(g6, tooltip=c('facility', 'value'))
# g1; g2; g3; g4; g5; g6

tb2 <- outframe %>% filter(variable=="beta_date") %>% select(facility, value) %>%
  mutate(well=factor(facility)) %>%
  group_by(well) %>%
  summarise(Mean = mean(value), 
            `Lower 2.5%` = quantile(value, 0.025), 
            `Upper 97.5%` = quantile(value, 0.975)) %>%
  mutate_if(is.numeric, round, 3) %>%
  inner_join(regression_df %>% mutate(well=factor(names(ids)[well_id])) %>% group_by(well) %>% summarise(n=n()), by="well")
## Warning: Column `well` joining factors with different levels, coercing to
## character vector
tb2[tb2$well %in% special_wells,] %>% knitr::kable()
well Mean Lower 2.5% Upper 97.5% n
wk124 -0.062 -0.085 -0.039 31
wk242 0.019 -0.009 0.047 28
wk263 -0.014 -0.021 -0.007 34
wk270 -0.083 -0.207 0.027 5
wk271 -0.071 -0.196 0.032 6
tb6[tb6$well %in% special_wells,] %>% knitr::kable()
well Mean Lower 2.5% Upper 97.5% n
wk124 22.816 17.403 30.203 31
wk242 148.061 111.727 199.446 28
wk263 13.987 11.184 18.154 34
wk270 35.757 13.654 93.629 5
wk271 107.769 51.155 253.192 6
# Hyper- parameters
hp.df <- outframe %>% filter(variable %in% c("mu_beta_date", "mu_beta_whp", "mu_Intercept"))
hp.quantiles <- hp.df %>%
  group_by(variable) %>%
  summarise(`Lower 2.5%` = quantile(value, 0.025),
            `Upper 97.5%` = quantile(value, 0.975)) %>%
  gather(key='Quantile', value='value', `Lower 2.5%`, `Upper 97.5%`)

coefplot = ggplot(hp.df, aes(x=value, fill=variable)) +
  facet_wrap(~variable, nrow=3, scales = "free") +
  geom_density(aes(y=..scaled..), alpha=0.5, color=NA) + 
  geom_vline(data=hp.quantiles, aes(xintercept=value, color=Quantile)) +
  labs(title="Posterior Regression Coefficients", x="Value", y="Density", fill="Variable")
ggplotly(coefplot)

Regression Fits

prod = as.data.frame(outmatrix) %>%
  select(contains('prod')) %>%
  gather(key=facility, value=value) %>%
  mutate(which=parse_number(facility)) %>%
  mutate(whp=data$whp_prod[which]) %>%
  rename(mf=value) %>%
  group_by(whp) %>%
  summarise(lower=quantile(mf, 0.025),
            upper=quantile(mf, 0.975),
            mean=mean(mf))

plotdata = regression_df %>%
  filter(well_id==ids[production_curve_well]) %>%
  mutate(datetime = factor(as.Date(date)))

ggplot(prod, aes(x=whp)) +
  geom_line(aes(y=mean, lty="Bayesian Regression"), color='red') +
  # geom_line(data=reglines, aes(y=mf, group=datetime, lty="OLS Regression")) +
  geom_ribbon(aes(ymin=lower, ymax=upper), alpha=0.25, fill='red') +
  geom_point(data=plotdata, aes(y=mf, color=date)) +
  labs(title=paste("Fitted Production Curve for", production_curve_well), x="Well-head pressure (bar)", y="Mass flow (T/h)", color="Date", linetype="") +
  coord_cartesian(xlim=c(min(plotdata$whp)*0.875,max(plotdata$whp)*1.125), ylim=c(0,max(plotdata$mf)*1.125)) +
  ggsave('../media/production_curve.png', width=6, height=4, units='in')

ts_out = as.data.frame(outmatrix) %>%
  select(contains('ts')) %>%
  gather() %>%
  mutate(which=parse_number(key)) %>%
  mutate(date_numeric=data$date_numeric_ts[which] + mean_date_numeric) %>%
  rename(mf=value) %>%
  group_by(date_numeric) %>%
  summarise(lower=quantile(mf, 0.025),
            upper=quantile(mf, 0.975),
            mean=mean(mf))

tsplotdata = dry_df %>%
  filter(well_id==ids[ts_well]) %>%
  mutate(datetime = factor(as.Date(date)))

ggplot(ts_out, aes(x=date_numeric)) +
  geom_line(aes(y=mean, lty="Bayesian Regression"), color='red') +
  geom_ribbon(aes(ymin=lower, ymax=upper), alpha=0.25, fill='red') +
  geom_line(data=tsplotdata, aes(y=mf, color=date)) +
  ylim(0, NA) +
  labs(title=paste("Fitted Linear Time Series for", ts_well), x="Days since baseline (2000)", y="Mass flow (T/h)", color="Date", linetype="") +
  ggsave('../media/dry_time_series.png', width=6, height=4, units='in')

# experimental AR1 time series
ar_fit = as.data.frame(outmatrix) %>%
  select(contains("mu_ar")) %>%
  gather() %>%
  mutate(t = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)"))) %>%
  mutate(which = as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))) %>%
  mutate(facility = names(ids)[which]) %>%
  select(facility, t, value) %>%
  group_by(facility, t) %>%
  summarise(mean=mean(value),
            lower=quantile(value, 0.025),
            upper=quantile(value, 0.975))

actualts = drymatrix %>% as.data.frame() %>%
  mutate(t = 1:nrow(drymatrix)) %>%
  gather(key="facility", value="value", -t) %>%
  filter(facility %in% names(ids)[ar_well_ids])

arplot = ggplot(ar_fit, aes(x=t, y=mean, fill=facility, color=facility)) +
  geom_line(data=actualts, aes(y=value)) +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
  geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 50)) +
  labs(title="AR(1) Experiment", x="Days since first date", y="Mass flow (T/h)") +
  ggsave('../media/ar_experiment.png', width=6, height=4, units='in')
## Warning: Removed 1082 rows containing missing values (geom_path).
# experimental EMA time series
ema_fit = as.data.frame(outmatrix) %>%
  select(contains("mu_ema")) %>%
  gather() %>%
  mutate(t = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)"))) %>%
  mutate(which = as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))) %>%
  mutate(facility = names(ids)[which]) %>%
  select(facility, t, value) %>%
  group_by(facility, t) %>%
  summarise(mean=mean(value),
            lower=quantile(value, 0.025),
            upper=quantile(value, 0.975))

emaplot = ggplot(ema_fit, aes(x=t, y=mean, fill=facility, color=facility)) +
  geom_line(data=actualts, aes(y=value)) +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
  geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 50)) +
  labs(title="EWMA Experiment", x="Days since first date", y="Mass flow (T/h)") +
  ggsave('../media/ema_experiment.png', width=6, height=4, units='in')
## Warning: Removed 1082 rows containing missing values (geom_path).
ggplotly(arplot)
ggplotly(emaplot)

Trace plots

trace1 <- outframe %>%
  filter(variable=='mf_pred' & facility=='wk256')
trace2 <- outframe %>%
  filter(variable=='total_power')
trace3 <- outframe %>%
  filter(variable=='mu_Intercept')
traceplot = ggplot(trace1, aes(y=value, color=variable)) +
  geom_line(aes(x=as.numeric(row.names(trace1)))) +
  geom_line(data=trace2, aes(x=as.numeric(row.names(trace1)))) +
  geom_line(data=trace3, aes(x=as.numeric(row.names(trace1)))) +
  labs(title="Trace Plots (Single chain)", x="Iteration", y="Node value", color="Variable") +
  ggsave('../media/trace_plots.png', width=6, height=4, units='in')
ggplotly(traceplot)

Goodness of fit (OLS regression)

liq_fit = as.data.frame(outmatrix) %>%
  select(contains('mf_fit')) %>%
  gather(key='index', value='fitted') %>%
  mutate(index=as.integer(parse_number(index))) %>%
  group_by(index) %>%
  summarise(lower=quantile(fitted, 0.025),
            upper=quantile(fitted, 0.975),
            Fitted=mean(fitted),
            std=sd(fitted)) %>%
  cbind(regression_df) %>%
  mutate(`Standardised residual` = (Fitted-mf)/std,
         Well = factor(names(ids[well_id])),
         Observed = mf) %>%
  gather(key="key", value="value", `Standardised residual`, Observed) %>%
  select(Well, key, Fitted, value)

diagplot = ggplot(liq_fit, aes(x=Fitted, y=value)) +
  geom_point(aes(color=Well, shape=Well)) + scale_shape_manual(values = 1:length(levels(liq_fit$Well))) +
  geom_smooth(color='black') +
  facet_grid(key~., scales="free_y", switch="y") +
  geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
  labs(title="Diagnostic Plots", x="Fitted mass flow (T/h)", y="") +
  ggsave('../media/diagnostics.png', width=6, height=4, units='in')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplotly(diagplot)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
stdres_min = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% min()
stdres_max = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% max()
resdesplot = ggplot(liq_fit %>% filter(key=="Standardised residual"), aes(x=value)) +
  geom_density(fill="red", alpha=0.5, color=NA) +
  geom_line(data=data.frame(x=seq(stdres_min, stdres_max, length.out=100)), aes(x=x, y=dnorm(x)))
ggplotly(resdesplot)

Constraint Violations

sf.df <- outframe %>% 
  filter(str_detect(variable, "total_sf") & value > 0) %>% 
  droplevels()
limits = fp_constants %>%
  mutate(facility = names(ids)[fp_id]) %>%
  select(facility, limit) %>%
  drop_na()

limitplot = ggplot(sf.df, aes(x=value, fill=facility)) +
  facet_grid(facility~., scales = "free_y", switch="y") +
  geom_density(color=NA, alpha=0.5) +
  geom_vline(data=limits, aes(xintercept=limit, color=facility)) +
  labs(title="Posterior Flash Plant Mass Flows", x="Steam flow (T/h)", y="Density", fill="Flash plant", color="Steam flow limit") +
  ggsave('../media/constraints.png', width=6, height=4, units='in')
ggplotly(limitplot)

Diagnostics

random_var_ix = sample.int(ncol(outmatrix), 100) # 100 random var because it takes too long
geweke.diag(out[,1:100])
## [[1]]
## 
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5 
## 
##          alpha   beta_date[2]   beta_date[3]   beta_date[4]   beta_date[5] 
##       0.952808      -3.882998       0.136275       1.380467      -1.157648 
##   beta_date[6]   beta_date[7]   beta_date[8]   beta_date[9]  beta_date[10] 
##       0.593783       1.357520      -1.966898      -3.509355      -0.312900 
##  beta_date[11]  beta_date[12]  beta_date[13]  beta_date[14]  beta_date[15] 
##       0.148635      -1.519220      -0.311158      -0.107042      -2.133065 
##  beta_date[16]  beta_date[17]  beta_date[18]  beta_date[19]  beta_date[20] 
##       0.673966       0.374221       1.150058       0.748739       0.551063 
##  beta_date[21]  beta_date[22]  beta_date[23]  beta_date[24]  beta_date[25] 
##      -0.593603       0.588185       0.106633       1.676510       1.086270 
##  beta_date[26]  beta_date[27]  beta_date[28]  beta_date[29]  beta_date[30] 
##       0.018949      -5.867607      -0.647338      -0.303219       8.064064 
##  beta_date[31]  beta_date[32]  beta_date[33]  beta_date[34]  beta_date[35] 
##      -2.967810      -6.160783       4.450616      -8.445947      -1.137245 
##  beta_date[36]  beta_date[37]  beta_date[38]  beta_date[39]  beta_date[40] 
##       1.895352     -10.960913      -5.790536      -0.013814       0.261345 
##  beta_date[41]  beta_date[42]  beta_date[43]  beta_date[44]  beta_date[45] 
##      21.211797      -3.211275       0.453401       0.583987      -0.898446 
##  beta_date[46]  beta_date[47]  beta_date[48]  beta_date[49]  beta_date[50] 
##       0.975961       1.751256      11.515047       2.236298      -1.752002 
##  beta_date[51]  beta_date[52]  beta_date[53]  beta_date[54]  beta_date[55] 
##      -0.526641     -20.849932       0.717295       1.643243      -5.295595 
##  beta_date[56]  beta_date[57]  beta_date[58]  beta_date[59]  beta_date[60] 
##       4.062297      -6.711596       6.048649      -7.083123       5.491588 
##  beta_date[61]  beta_date[62]  beta_date[63]          h[64]          h[65] 
##     -14.323486      -7.095878       0.979930      -0.555846       0.430803 
##          h[66]          h[67]          h[68]          h[69]          h[70] 
##      -0.055280      -0.738454       0.613307      -0.756644       0.172275 
##          h[71]          h[72]          h[73]          h[74]  mf_dry_fit[1] 
##      -0.204978       0.272638      -0.343717            NaN      -0.440623 
##  mf_dry_fit[2]  mf_dry_fit[3]  mf_dry_fit[4]  mf_dry_fit[5]  mf_dry_fit[6] 
##       0.263039      -1.106354       1.998523       0.775413      -1.093132 
##  mf_dry_fit[7]  mf_dry_fit[8]  mf_dry_fit[9] mf_dry_fit[10] mf_dry_fit[11] 
##      -0.307279       0.467322       0.036460       1.533074      -0.551737 
## mf_dry_fit[12] mf_dry_fit[13] mf_dry_fit[14] mf_dry_fit[15] mf_dry_fit[16] 
##      -0.176634      -1.441030      -1.144953      -0.008069       0.511269 
## mf_dry_fit[17] mf_dry_fit[18] mf_dry_fit[19] mf_dry_fit[20] mf_dry_fit[21] 
##       1.204811       0.076522       0.286230      -2.382012      -0.238553 
## mf_dry_fit[22] mf_dry_fit[23] mf_dry_fit[24] mf_dry_fit[25] mf_dry_fit[26] 
##      -0.493573       1.553093       0.397984      -0.083341       0.107929 
## 
## 
## [[2]]
## 
## Fraction in 1st window = 0.1
## Fraction in 2nd window = 0.5 
## 
##          alpha   beta_date[2]   beta_date[3]   beta_date[4]   beta_date[5] 
##       -0.95709        1.04328        0.96737       -1.10546       -1.16558 
##   beta_date[6]   beta_date[7]   beta_date[8]   beta_date[9]  beta_date[10] 
##       -0.14319       -2.56538       -0.77801       -1.64584       -1.50482 
##  beta_date[11]  beta_date[12]  beta_date[13]  beta_date[14]  beta_date[15] 
##        0.57578       -0.62893        0.36000        0.28266       -1.99767 
##  beta_date[16]  beta_date[17]  beta_date[18]  beta_date[19]  beta_date[20] 
##       -0.27262        1.82941       -0.22753       -0.14895        1.46035 
##  beta_date[21]  beta_date[22]  beta_date[23]  beta_date[24]  beta_date[25] 
##        2.54816        0.20595        0.28710        0.06958       -0.19985 
##  beta_date[26]  beta_date[27]  beta_date[28]  beta_date[29]  beta_date[30] 
##        1.81657        0.30120        1.18317       -7.91374        0.45091 
##  beta_date[31]  beta_date[32]  beta_date[33]  beta_date[34]  beta_date[35] 
##        3.53159       -2.28784       10.86865        1.82641       12.61772 
##  beta_date[36]  beta_date[37]  beta_date[38]  beta_date[39]  beta_date[40] 
##       -0.17237       -1.58323      -10.33315        0.22985        3.49113 
##  beta_date[41]  beta_date[42]  beta_date[43]  beta_date[44]  beta_date[45] 
##       -7.23616       -0.40148        2.87105        6.07667       -4.89801 
##  beta_date[46]  beta_date[47]  beta_date[48]  beta_date[49]  beta_date[50] 
##      -13.72143       -5.85147       -7.07283       -2.07676       -0.30025 
##  beta_date[51]  beta_date[52]  beta_date[53]  beta_date[54]  beta_date[55] 
##        0.83505        5.61170       -0.13402      -11.83544       -1.78355 
##  beta_date[56]  beta_date[57]  beta_date[58]  beta_date[59]  beta_date[60] 
##        7.48855        4.66103        6.28791        1.79025       -4.04766 
##  beta_date[61]  beta_date[62]  beta_date[63]          h[64]          h[65] 
##       -8.84464      -11.40829       -3.04260       -0.08750        0.18378 
##          h[66]          h[67]          h[68]          h[69]          h[70] 
##       -1.61666       -0.07449        1.01967       -0.48977       -0.62089 
##          h[71]          h[72]          h[73]          h[74]  mf_dry_fit[1] 
##       -0.17065        0.72195       -0.37794            NaN       -1.20271 
##  mf_dry_fit[2]  mf_dry_fit[3]  mf_dry_fit[4]  mf_dry_fit[5]  mf_dry_fit[6] 
##       -0.21124       -1.14957       -0.28674       -1.53528       -0.14007 
##  mf_dry_fit[7]  mf_dry_fit[8]  mf_dry_fit[9] mf_dry_fit[10] mf_dry_fit[11] 
##       -0.87801        0.64351       -1.67410       -0.91101        0.19235 
## mf_dry_fit[12] mf_dry_fit[13] mf_dry_fit[14] mf_dry_fit[15] mf_dry_fit[16] 
##        1.20617       -0.84323        0.66160       -2.79829       -0.16462 
## mf_dry_fit[17] mf_dry_fit[18] mf_dry_fit[19] mf_dry_fit[20] mf_dry_fit[21] 
##       -1.93559       -0.80735        1.86436        1.23058       -1.31262 
## mf_dry_fit[22] mf_dry_fit[23] mf_dry_fit[24] mf_dry_fit[25] mf_dry_fit[26] 
##       -2.52667       -0.32557       -0.13842        2.59120        0.81179
gelman.diag(out[,c(paste0('mf_pred[', 8:9, ']'), 'beta_date[9]', 'mu_beta_whp', 'mu_beta_date', 'mu_Intercept', 'total_power')])[[1]] %>% as.data.frame() %>% round(2)
Point est. Upper C.I.
mf_pred[8] 1 1.02
mf_pred[9] 1 1.00
beta_date[9] 1 1.00
mu_beta_whp 1 1.00
mu_beta_date 1 1.01
mu_Intercept 1 1.00
total_power 1 1.01
raftery.diag(out[,c(paste0('mf_pred[', 8:9, ']'), 'beta_date[9]', 'mu_beta_whp', 'mu_beta_date', 'mu_Intercept', 'total_power')])
## [[1]]
## 
## Quantile (q) = 0.025
## Accuracy (r) = +/- 0.005
## Probability (s) = 0.95 
## 
## You need a sample size of at least 3746 with these values of q, r and s
## 
## [[2]]
## 
## Quantile (q) = 0.025
## Accuracy (r) = +/- 0.005
## Probability (s) = 0.95 
## 
## You need a sample size of at least 3746 with these values of q, r and s
testx = seq(0, 10, 1)
testy = 2*testx
N = length(testx)
testdata = list(x=testx, y=testy, N=N)

testcode = "
model {
  for (i in 1:N) {
    mu[i] <- alpha + beta * x[i]
    y[i] ~ dnorm(mu[i], tau)
  }
  alpha ~ dnorm(0, 1e-8)
  beta ~ dnorm(0, 1e-8)
  tau ~ dgamma(1e-8, 1e-8)

  z ~ dnorm(0, 1)
  trunc <- max(min(z, 1), -1)
}
"

testmodel = jags.model(textConnection(testcode), testdata, n.chains=1)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 11
##    Unobserved stochastic nodes: 4
##    Total graph size: 55
## 
## Initializing model
update(testmodel, burn_in)
testout = coda.samples(testmodel, n.iter=10000, variable.names=c("beta", "alpha", "z", "trunc")) %>%
  as.matrix() %>%
  as.data.frame() %>%
  gather()
ggplot(testout, aes(x=value, fill=key)) +
  geom_histogram(color=NA, alpha=0.5) +
  facet_grid(key~., scales="free")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.